import pandas as pd
import gc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from io import StringIO
import session_info
%matplotlib inline
import itertools
import bioframe as bf
import bioframe.vis
import dash_bio
gc.collect()
15
## read the data from Google Drive public link
## data = pd.read_csv('./datacp_upd_fxd_210822.csv.gz')
link1 = "https://drive.google.com/file/d/1oiMPamG0tZBbXTI8RuBSHEBZo7CtEaLP/view?usp=sharing"
link2 =f"https://drive.google.com/uc?export=download&confirm=t&id={link1.split('/')[-2]}"
response = requests.get(link2) ## get the gzipped CSV table
raw = response.raw
with open('./outdata.gz', 'wb') as out_file:
out_file.write( raw.read() ) ## write to file
## if something went wrong with your download you can get the data with link1
## read the data from downloaded gzipped CSV table
data = pd.read_csv('./outdata.gz')
<ipython-input-5-679bf3f10dcf>:1: DtypeWarning: Columns (20,21) have mixed types. Specify dtype option on import or set low_memory=False.
data = pd.read_csv('./outdata.gz')
data.head()
| qseqid | caption_x | pident | length | mismatch | gapopen | qstart | qend | sstart | send | ... | phylum | class | order | family | genus | species | data_type | project | prog | fxd_taxid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NODE_125_length_10672_cov_442.064990 | YP_009337182.1 | 32.860 | 1339 | 846 | 20 | 6629 | 2673 | 17 | 1322 | ... | Negarnaviricota | Monjiviricetes | Mononegavirales | Xinmoviridae | Anphevirus | Dipteran anphevirus | trx | PRJNA353242 | blastx | 1922872 |
| 1 | NODE_1_length_14482_cov_858.024121 | YP_009337182.1 | 31.780 | 1438 | 926 | 22 | 6608 | 2355 | 17 | 1419 | ... | Negarnaviricota | Monjiviricetes | Mononegavirales | Xinmoviridae | Anphevirus | Dipteran anphevirus | trx | PRJNA275431 | blastx | 1922872 |
| 2 | NODE_1_length_14486_cov_119.472039 | YP_009337182.1 | 31.780 | 1438 | 926 | 22 | 6605 | 2352 | 17 | 1419 | ... | Negarnaviricota | Monjiviricetes | Mononegavirales | Xinmoviridae | Anphevirus | Dipteran anphevirus | trx | PRJNA275662 | blastx | 1922872 |
| 3 | NODE_2_length_14498_cov_773.643564 | YP_009337182.1 | 31.780 | 1438 | 926 | 22 | 6616 | 2363 | 17 | 1419 | ... | Negarnaviricota | Monjiviricetes | Mononegavirales | Xinmoviridae | Anphevirus | Dipteran anphevirus | trx | PRJNA297027 | blastx | 1922872 |
| 4 | NODE_13_length_514_cov_145.931393 | YP_009337182.1 | 29.412 | 170 | 109 | 3 | 11 | 514 | 895 | 1055 | ... | Negarnaviricota | Monjiviricetes | Mononegavirales | Xinmoviridae | Anphevirus | Dipteran anphevirus | trx | PRJNA438159 | blastx | 1922872 |
5 rows × 28 columns
datacp_upd_fxd_eval03 = data[data.evalue < 10**-3]
cont_foo = datacp_upd_fxd_eval03[datacp_upd_fxd_eval03.kindom == 'Viruses']
## cont_foo is the table of contigs hits with Viral nucleotide sequences (with BLASTx)
## with e-values < 10**-3
## the function to merge the overlapping intervals -- to assess the overall alignment length
def get_cover(tdf):
return bf.merge( pd.DataFrame( {'chrom' : 'chr1',
'start' : tdf[['qstart', 'qend']].min(1), ## the minimal value stands for the start
'end' : tdf[['qstart', 'qend']].max(1) } ), ## and the maximal value stands for the end
min_dist=0)[['start', 'end']].dot(np.array([-1,1])).sum()
## we applied this aggregation since in general the lengths of viral genomes belonging to the same family are quite close
## here we calculated the total alignment lengths by viral family
cont_virfam_cover = cont_foo.groupby(['qseqid','family']).apply(get_cover).unstack()
## the clustermap of alignment lengths by contigs and viral families
## it was customized to drop the denrogram for contigs
## and to adjust the heght of the dendrogram for viral families
## as it was suggested in:
## https://www.data-to-viz.com/graph/dendrogram.html
cg = sns.clustermap(np.log( cont_virfam_cover.fillna(0).astype(int).iloc[:, 1:] + 1 ).drop_duplicates() , figsize = [20, 30],
dendrogram_ratio=(.01, .1) );
cg.ax_row_dendrogram.set_visible(False) #suppress row dendrogram
#cg.ax_col_dendrogram.set_visible(False) #suppress column dendrogram
cg.ax_cbar.set_visible(False);
cg.ax_heatmap.set_xlabel('')
cg.ax_heatmap.xaxis.tick_top() # x axis on top
cg.ax_heatmap.xaxis.set_label_position('top')
cg.ax_heatmap.set_xticklabels(cg.ax_heatmap.get_xticklabels(), rotation = 90);
plt.gcf().subplots_adjust(hspace=0.125, wspace=0.125);
## don't forget to collect the garbage from time to time
gc.collect()
15
data_array = np.log( cont_virfam_cover.fillna(0).astype(int).iloc[:, 1:] + 1 ).drop_duplicates()
dbg = dash_bio.Clustergram(
data=data_array,
column_labels=list(data_array.columns.values),
row_labels=list(data_array.index),
height=13000,
width=1000,
#colorbar = None,
optimal_leaf_order = True,
display_ratio=[0.001, 0.01],
link_method = 'single',
color_map= [
[0.0, '#636EFA'],
[0.25, '#AB63FA'],
[0.5, '#FFFFFF'],
[0.75, '#E763FA'],
[1.0, '#EF553B']
],
#config = {'style': {'font-size': 12} }
# return_computed_traces = True
)
## https://stackoverflow.com/questions/58630928
dbg.update_traces(dict(showscale=False), selector={'type':'heatmap'})
dbg.update_layout(font_family="Consolas",
font=dict(size= 8) )
session_info.show(excludes = ['pyarrow','distributed'])
----- bioframe 0.3.3 dash_bio 1.0.2 matplotlib 3.3.1 numpy 1.20.2 pandas 1.4.3 plotly 4.12.0 requests 2.25.1 seaborn 0.11.0 session_info 1.0.0 -----
Bio 1.78 GEOparse 2.0.3 PIL 8.2.0 asciitree NA attr 20.3.0 backcall 0.2.0 blinker 1.4 brotli NA certifi 2022.06.15 cffi 1.14.5 cftime 1.4.1 chardet 3.0.4 click 7.1.2 cloudpickle 1.6.0 colorama 0.4.4 colour NA cycler 0.10.0 cython_runtime NA cytoolz 0.11.0 dash 1.17.0 dash_renderer 1.8.3 dask 2020.12.0 dateutil 2.8.1 decorator 5.0.6 fastcluster 1.1.26 fasteners NA fiona 1.8.13 flask 1.1.2 flask_compress NA future 0.18.2 geopandas 0.8.1 google NA idna 2.8 ipykernel 5.3.4 ipython_genutils 0.2.0 ipywidgets 7.6.3 itsdangerous 1.1.0 jedi 0.17.0 jinja2 2.11.3 joblib 1.0.1 jsonschema 3.2.0 kiwisolver 1.3.1 lxml 4.6.3 markupsafe 1.1.1 mpl_toolkits NA msgpack 1.0.2 nbformat 5.1.3 netCDF4 1.5.6 nt NA ntsecuritycon NA numcodecs 0.7.3 numexpr 2.7.3 parmed 3.4.3 parso 0.8.2 periodictable 1.6.1 pickleshare 0.7.5 pkg_resources NA prompt_toolkit 3.0.17 psutil 5.8.0 pvectorc NA pygments 2.8.1 pyparsing 2.4.7 pyproj 2.6.1.post1 pyrsistent NA pythoncom NA pytz 2021.1 pywintypes NA retrying NA rtree 0.9.4 scipy 1.5.3 shapely 1.7.1 six 1.12.0 sklearn 0.23.2 socks 1.7.1 statsmodels 0.12.2 stdlib_list v0.8.0 storemagic NA tblib 1.7.0 tlz 0.11.0 toolz 0.11.1 tornado 6.1 tqdm 4.59.0 traitlets 5.0.5 typing_extensions NA urllib3 1.25.11 wcwidth 0.2.5 werkzeug 1.0.1 win32api NA win32com NA win32security NA xarray 0.17.0 yaml 5.4.1 zarr 2.7.0 zmq 20.0.0
----- IPython 7.19.0 jupyter_client 6.1.12 jupyter_core 4.7.1 notebook 6.3.0 ----- Python 3.8.8 (default, Feb 24 2021, 15:54:32) [MSC v.1928 64 bit (AMD64)] Windows-10-10.0.19041-SP0 ----- Session information updated at 2022-12-16 15:47